from IPython.display import Image
Training Data Description: Historic sales at store-day level for about two years for a retail giant, covering more than 1,000 stores. Other sales influencers are also provided, such as whether a store was fully open or closed for renovation on a given day, plus holiday and special-event details.
Image('1566547170_cap 3.png')
#importing the necessary packages
import pandas as pd
import pandas_profiling
# Load the raw training data and generate an exploratory profiling report.
# NOTE(review): pandas_profiling was renamed to ydata-profiling in newer releases.
df = pd.read_csv('train_data.csv')
pandas_profiling.ProfileReport(df)
Observation: Sales is highly and positively correlated with Customers, Open and Promo. There is a low positive correlation with SchoolHoliday and a low negative correlation with StateHoliday.
from sklearn.preprocessing import LabelEncoder
#Label encode DayOfWeek and StateHoliday
encode = LabelEncoder()
# StateHoliday mixes integer 0 with strings 'a'/'b'/'c'; unify to string '0'
# so get_dummies produces a single '0' column.
df['StateHoliday'] = df['StateHoliday'].replace([0], '0')
df['DayOfWeek'] = encode.fit_transform(df['DayOfWeek'])
#df['StateHoliday'] = encode.fit_transform(df['StateHoliday'])
print(df['DayOfWeek'].unique(), df['DayOfWeek'].dtype)
df['StateHoliday'].unique()
# One-hot encode StateHoliday.
Temp = pd.get_dummies(df['StateHoliday'])
Temp.head()
df = pd.concat([df, Temp], axis=1)
# One dict-based rename instead of four separate rename() calls.
df.rename(columns={'0': 'State0', 'a': 'Statea', 'b': 'Stateb', 'c': 'Statec'},
          inplace=True)
df
# One-hot encode the label-encoded DayOfWeek.
Temp = pd.get_dummies(df['DayOfWeek'])
Temp.head()
df = pd.concat([df, Temp], axis=1)
# NOTE(review): LabelEncoder maps the sorted DayOfWeek values to 0..6, so if the
# raw data uses 1=Monday..7=Sunday, code 0 is Monday, not Sunday — confirm the
# day names below against the data dictionary.
df.rename(columns={0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed',
                   4: 'thu', 5: 'fri', 6: 'sat'}, inplace=True)
df
# Drop the original categorical columns now that dummies exist.
df = df.drop(['StateHoliday', 'DayOfWeek'], axis=1)
df
# Features: everything except the target (Sales) and the raw Date string.
X = df.drop(['Sales', 'Date'], axis=1)
Y = df['Sales']
from sklearn import model_selection
# 70/30 split with a fixed seed for reproducibility.
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X,Y,
test_size=0.3,
random_state = 1)
from sklearn.linear_model import LinearRegression
from math import sqrt
from sklearn.metrics import r2_score, mean_squared_error
import pickle # allows for saving models to hard drive
# Baseline: ordinary least squares on all stores.
lg = LinearRegression()
lg.fit(xtrain,ytrain)
print(lg.intercept_, lg.coef_)
import math
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent) of y_pred vs y_true.

    FIX: entries with y_true == 0 are excluded from the mean — this dataset has
    Sales == 0 on days a store was closed, which previously made the division
    blow up to inf/NaN. Behaviour is unchanged when no zeros are present.
    """
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

# Sanity check on a tiny example.
y_true = [1,2,3,4]
y_pred = [1,2,5,4]
mean_absolute_percentage_error(y_true,y_pred)
# Evaluate the baseline linear model on the held-out 30% split.
lrpredictions = lg.predict(xtest)
#print(sqrt(mean_squared_error(ytrain, lg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, lg.predict(xtest))))
r2score = r2_score(ytest, lrpredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, lrpredictions)))
#mape = mean_absolute_percentage_error(ytest.to_numpy(), predictions)
#print('MAPE: {}'.format(mape))
mse = mean_squared_error(ytest, lrpredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
Observation: Lasso seems to perform better based on its R2 value.
# Per-store model: store 1 only.
df_store1 = df[df['Store']==1]
# FIX(review): profile the store-1 slice — the original profiled the full df again.
pandas_profiling.ProfileReport(df_store1)
df_store1
X = df_store1.drop(['Sales', 'Date'], axis=1)
Y = df_store1['Sales']
from sklearn import model_selection
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y,
                                                                test_size=0.3,
                                                                random_state=1)
lg = LinearRegression()
lg.fit(xtrain, ytrain)
predictions = lg.predict(xtest)
# FIX: the original printed sqrt(MSE) of `lassoreg.predict(...)`, but lassoreg
# is not defined until much later in the script — a NameError at this point.
# Report the train/test RMSE of the linear model just fitted instead.
print(sqrt(mean_squared_error(ytrain, lg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, lg.predict(xtest))))
r2score = r2_score(ytest, predictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, predictions)))
mse = mean_squared_error(ytest, predictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
Observation: R2 value is increased and errors have reduced compared to model trained on all stores
# Per-store model: store 2 only.
df_store2 = df[df['Store']==2]
X = df_store2.drop(['Sales', 'Date'], axis=1)
Y = df_store2['Sales']
from sklearn import model_selection
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y,
                                                                test_size=0.3,
                                                                random_state=1)
lg = LinearRegression()
lg.fit(xtrain, ytrain)
predictions = lg.predict(xtest)
# FIX: the original referenced `lassoreg` here, which is not defined until much
# later in the script (NameError). Evaluate the freshly fitted linear model.
print(sqrt(mean_squared_error(ytrain, lg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, lg.predict(xtest))))
r2score = r2_score(ytest, predictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, predictions)))
mse = mean_squared_error(ytest, predictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
Observation: R2 value is increased and errors have reduced compared to model trained on all stores
#Carry out test train split again on all stores data since it is overwritten by store wise data
X = df.drop(['Sales', 'Date'], axis=1)
Y = df['Sales']
from sklearn import model_selection
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X,Y,
test_size=0.3,
random_state = 1)
from sklearn.ensemble import RandomForestRegressor
# NOTE(review): max_depth=3 is very shallow for this feature set — presumably a
# quick baseline; the randomized search further below explores deeper trees.
regr = RandomForestRegressor(max_depth=3, random_state=0)
regr = regr.fit(xtrain, ytrain)
predictions = regr.predict(xtest)
print(sqrt(mean_squared_error(ytest, predictions)))
r2score = r2_score(ytest, predictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, predictions)))
mse = mean_squared_error(ytest, predictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
# Training-set R^2 and per-feature importances.
print(regr.score(xtrain, ytrain, sample_weight=None))
regr.feature_importances_
Observation: R2 has decreased from 0.8568175798803241 to 0.817251475681456 after using bagging ensemble
from sklearn.ensemble import AdaBoostRegressor
# Boosting ensemble on the all-stores split.
regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regr.fit(xtrain, ytrain)
# FIX: removed a stray `AdaBoostRegressor(n_estimators=100, random_state=0)`
# expression — pasted notebook output that built and discarded a second,
# unused estimator.
predictions = regr.predict(xtest)
regr.score(xtrain, ytrain)
r2score = r2_score(ytest, predictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, predictions)))
mse = mean_squared_error(ytest, predictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
print(regr.score(xtrain, ytrain, sample_weight=None))
Observation: R2 has decreased from 0.8568175798803241 to 0.6654808307400246 after using boosting ensemble
from sklearn.linear_model import Lasso
# NOTE(review): the `normalize` parameter was deprecated in scikit-learn 1.0 and
# removed in 1.2; on modern versions use a StandardScaler inside a Pipeline.
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(xtrain, ytrain)
lassopredictions = lassoreg.predict(xtest)
# Train RMSE followed by test RMSE.
print(sqrt(mean_squared_error(ytrain, lassoreg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, lassoreg.predict(xtest))))
r2score = r2_score(ytest, lassopredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, lassopredictions)))
mse = mean_squared_error(ytest, lassopredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
from sklearn.linear_model import Ridge
# NOTE(review): `normalize` was removed in scikit-learn 1.2 — see the Lasso cell.
ridgereg = Ridge(alpha=0.001, normalize=True)
ridgereg.fit(xtrain, ytrain)
ridgepredictions = ridgereg.predict(xtest)
# Train RMSE followed by test RMSE.
print(sqrt(mean_squared_error(ytrain, ridgereg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, ridgereg.predict(xtest))))
r2score = r2_score(ytest, ridgepredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, ridgepredictions)))
mse = mean_squared_error(ytest, ridgepredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
from sklearn.linear_model import ElasticNet
# NOTE(review): `normalize` was removed in scikit-learn 1.2 — see the Lasso cell.
Elas = ElasticNet(alpha=0.001, normalize=True)
Elas.fit(xtrain, ytrain)
enetpredictions = Elas.predict(xtest)
# Train RMSE followed by test RMSE.
print(sqrt(mean_squared_error(ytrain, Elas.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, Elas.predict(xtest))))
r2score = r2_score(ytest, enetpredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, enetpredictions)))
mse = mean_squared_error(ytest, enetpredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
Observation: No significant improvement seen over linear regression (R2 = 0.8568175) in both Lasso (R2 = 0.8568) and Ridge (R2 = 0.8568)
#read test dataset:
dftest = pd.read_csv('test_data.csv')
dftest.head()
#prep for test dataset:
#Label encode DayOfWeek and StateHoliday
# NOTE(review): the LabelEncoder is re-fitted on the test data; if the test file
# is missing (or adds) a category, its codes will not match the training
# encoding. The same applies to get_dummies — a holiday type absent from the
# test file yields fewer dummy columns than the model was trained on. Reusing
# the training-time encoder and reindexing to the training columns is safer.
encode = LabelEncoder()
dftest['StateHoliday'] = dftest['StateHoliday'].replace([0], '0')
dftest['DayOfWeek'] = encode.fit_transform(dftest['DayOfWeek'])
#df['StateHoliday'] = encode.fit_transform(df['StateHoliday'])
print(dftest['DayOfWeek'].unique(), dftest['DayOfWeek'].dtype)
dftest['StateHoliday'].unique()
Temp = pd.get_dummies(dftest['StateHoliday'])
Temp.head()
dftest = pd.concat([dftest, Temp], axis=1)
# One dict-based rename instead of four separate rename() calls.
dftest.rename(columns={'0': 'State0', 'a': 'Statea', 'b': 'Stateb', 'c': 'Statec'},
              inplace=True)
dftest
Temp = pd.get_dummies(dftest['DayOfWeek'])
Temp.head()
dftest = pd.concat([dftest, Temp], axis=1)
dftest.rename(columns={0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed',
                       4: 'thu', 5: 'fri', 6: 'sat'}, inplace=True)
dftest
# Evaluate on the separate test file (unseen data).
X = dftest.drop(['Sales', 'Date'], axis=1)
Y = dftest['Sales']
# NOTE(review): `lg` was last refitted on the store-2 subset above, so these
# metrics come from that per-store model, not the all-stores fit — confirm intent.
lrpredictions = lg.predict(X)
#print(sqrt(mean_squared_error(ytrain, lg.predict(xtrain))))
print(sqrt(mean_squared_error(Y, lg.predict(X))))
r2score = r2_score(Y, lrpredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(Y, lrpredictions)))
#mape = mean_absolute_percentage_error(ytest.to_numpy(), predictions)
#print('MAPE: {}'.format(mape))
mse = mean_squared_error(Y, lrpredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
# Ridge and Lasso evaluated on the unseen test file.
ridgepredictions = ridgereg.predict(X)
print(sqrt(mean_squared_error(Y, ridgereg.predict(X))))
r2score = r2_score(Y, ridgepredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(Y, ridgepredictions)))
mse = mean_squared_error(Y, ridgepredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
lassopredictions = lassoreg.predict(X)
print(sqrt(mean_squared_error(Y, lassoreg.predict(X))))
r2score = r2_score(Y, lassopredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(Y, lassopredictions)))
mse = mean_squared_error(Y, lassopredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
Observation: Ridge gives the best R2 value (0.858337) on testdata.csv (data not seen by the model yet). This is because it avoids overfitting
a) Train a single model for all stores, where storeId can be a feature.
b) Train separate models for each store.
Note: Dimensional Reduction techniques like, PCA and Tree’s Hyperparameter Tuning will be required. Cross-validate to find the best parameters. Infer the performance of both the models.
a) Identify yearly trends and seasonal months
# Get metrics when predicted with entire dataset including closed days of stores
lg = LinearRegression()
lg.fit(xtrain,ytrain)
lrpredictions = lg.predict(xtest)
#print(sqrt(mean_squared_error(ytrain, lg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest, lg.predict(xtest))))
r2score = r2_score(ytest, lrpredictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, lrpredictions)))
#mape = mean_absolute_percentage_error(ytest.to_numpy(), predictions)
#print('MAPE: {}'.format(mape))
mse = mean_squared_error(ytest, lrpredictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
# Now let us remove stores data that were closed from both train and test
df_opendays = df[df['Open']==1]
df_opendays.head()
X_opendays = df_opendays.drop(['Sales', 'Date'], axis=1)
Y_opendays = df_opendays['Sales']
from sklearn import model_selection
xtrain_od, xtest_od, ytrain_od, ytest_od = model_selection.train_test_split(X_opendays,Y_opendays,
test_size=0.3,
random_state = 1)
# Predict using linear regression
lg_od = LinearRegression()
lg_od.fit(xtrain_od,ytrain_od)
lrpredictions_od = lg_od.predict(xtest_od)
#print(sqrt(mean_squared_error(ytrain, lg.predict(xtrain))))
print(sqrt(mean_squared_error(ytest_od, lrpredictions_od)))
r2score = r2_score(ytest_od, lrpredictions_od)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest_od, lrpredictions_od)))
mse = mean_squared_error(ytest_od, lrpredictions_od)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
Observation: there is no improvement in R2 and errors are increased after removing days where store was closed
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# Hyperparameter search space for the random forest.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): max_features='auto' was removed for regressors in scikit-learn
# 1.3; use 1.0 (all features) or 'sqrt' on modern versions.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
# FIX: removed a pasted copy of the printed grid — a bare dict literal that was
# notebook output, not code.
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
# NOTE(review): 100 candidates x 3 folds with up to 2000 trees each is a very
# long-running fit on ~2 years of store-day data.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(xtrain, ytrain)
rf_random.best_params_
# Refit a small forest and evaluate it on the held-out split.
# FIX: the original fitted on the full (X, Y) — which contains the test rows —
# and then evaluated on xtest, leaking the test set into training and inflating
# the metrics. Fit on the training split only.
regr = RandomForestRegressor(max_depth=3, random_state=0)
regr = regr.fit(xtrain, ytrain)
predictions = regr.predict(xtest)
print(sqrt(mean_squared_error(ytest, predictions)))
r2score = r2_score(ytest, predictions)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, predictions)))
mse = mean_squared_error(ytest, predictions)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))
# Training-set R^2 (the original scored on the leaked full data).
print(regr.score(xtrain, ytrain, sample_weight=None))
# NOTE(review): rf_random.best_params_ from the search above is never used here —
# presumably this cell was meant to train with the tuned parameters.
Project Task: Week 3
Implementing Neural Networks:
Cluster stores using sales and customer visits as features. Find out how many clusters or groups are possible. Also visualize the results.
import pandas as pd
# Reload and re-encode the training data from scratch for the clustering task.
df = pd.read_csv('train_data.csv')
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#Label encode DayOfWeek and StateHoliday
encode = LabelEncoder()
# Unify StateHoliday's mixed int/str zero before one-hot encoding.
df['StateHoliday'] = df['StateHoliday'].replace([0], '0')
df['DayOfWeek'] = encode.fit_transform(df['DayOfWeek'])
Temp = pd.get_dummies(df['StateHoliday'])
Temp.head()
df = pd.concat([df, Temp], axis=1)
# One dict-based rename instead of four separate rename() calls.
df.rename(columns={'0': 'State0', 'a': 'Statea', 'b': 'Stateb', 'c': 'Statec'},
          inplace=True)
Temp = pd.get_dummies(df['DayOfWeek'])
Temp.head()
df = pd.concat([df, Temp], axis=1)
df.rename(columns={0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed',
                   4: 'thu', 5: 'fri', 6: 'sat'}, inplace=True)
df = df.drop(['StateHoliday', 'DayOfWeek'], axis=1)
df
#X = df.drop(['Sales', 'Date'], axis=1)
#Y = df['Sales']
#from sklearn import model_selection
#xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X,Y,
# test_size=0.3,
# random_state = 1)
#Find optimal kvalue for running kmeans
df.iloc[:,3]
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn.cluster import KMeans
style.use("fivethirtyeight")
# Scatter of Sales (column 2) vs Customers (column 3).
#2 in df is Sales and 3 in df is Customers
plt.scatter(df.iloc[:, 2], df.iloc[:, 3], s = 30, color ='b')
# label the axes
plt.xlabel('Sales')
plt.ylabel('Customers')
plt.show()
plt.clf() # clear the figure
# Elbow method: within-cluster squared error (inertia) for k = 1..10.
cost = []
for i in range(1, 11):
    KM = KMeans(n_clusters = i, max_iter = 500)
    KM.fit(df[['Sales','Customers']])
    # calculates squared error for the clustered points
    cost.append(KM.inertia_)
# plot the cost against K values
plt.plot(range(1, 11), cost, color ='g', linewidth ='3')
plt.xlabel("Value of K")
# FIX: corrected the typo in the axis label ("Sqaured" -> "Squared").
plt.ylabel("Squared Error (Cost)")
plt.show() # clear the plot
# the point of the elbow is the most optimal value for choosing k
#Observation: Based on above graph 6 or 8 clusters can be created
#Now split data into clusters and in each cluster split data into train and test
from sklearn.cluster import KMeans
# Cluster using SOME columns
# Cluster on Sales and Customers only, with k=8 chosen from the elbow plot above.
kmeans = KMeans(n_clusters=8, random_state=0)
kmeans = kmeans.fit(df[['Sales','Customers']])
# Save the labels
df.loc[:,'labels'] = kmeans.labels_
df['labels'].value_counts()
# NOTE(review): the 'labels' column is kept inside both X and Y so the cluster
# groups can be re-formed after the split — it therefore also reaches the
# per-cluster models below as a feature/target; confirm that is intended.
X = df.drop(['Sales', 'Date'], axis=1)
Y = df[['Sales', 'labels']]
from sklearn import model_selection
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X,Y,
test_size=0.3,
random_state = 1)
import csv
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

# Fit one Lasso per KMeans cluster and log the metrics to a CSV file.
# NOTE(review): both the feature groups (a/b) and the target groups (c/d) still
# carry the 'labels' column, so the cluster id leaks into the features and the
# target becomes multi-output ['Sales', 'labels'] — confirm this is intended.
train_cluster = xtrain.groupby(['labels'])
test_cluster = xtest.groupby(['labels'])
train_label = ytrain.groupby(['labels'])
test_label = ytest.groupby(['labels'])
r2_avg = 0
with open('cluster_wise_predictions.csv', 'w', newline='') as csvfile:
    metricwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    for i in range(0, 8):
        a = train_cluster.get_group(i)   # cluster-i training features
        b = test_cluster.get_group(i)    # cluster-i test features
        c = train_label.get_group(i)     # cluster-i training targets
        d = test_label.get_group(i)      # cluster-i test targets
        model = Lasso(alpha=0.0005)
        model.fit(a, c)
        predictions = model.predict(b)
        r2score = r2_score(d, predictions)
        r2_avg = r2_avg + r2score
        mse = mean_squared_error(d, predictions)
        rmse = math.sqrt(mse)
        # FIX: the row label said 'Store :' but i is a cluster id, not a store id.
        metricwriter.writerow(['Cluster:', i, 'R2 Value/Coeff of Det:', format(r2score),
                               'Mean Absolute Error:', format(mean_absolute_error(d, predictions)),
                               'MSE: ', format(mse), 'Root Mean Square Error: ', format(rmse)])
print('Average R2:')
print(r2_avg/(i+1) )
print('i:')
print(i)
#Observation: Certain clusters had good R2 values and some had low values
*Cluster R2 Value/Coeff of Det: Mean Absolute Error: MSE: Root Mean Square Error:
*0 0.988865017 2.035035729 502.2389592 22.4106885
*1 0.536374208 204.6200605 115162.1122 339.3554363
*2 0.545701627 507.5239339 756814.5271 869.9508762
*3 0.674526354 206.4369671 130559.9386 361.3307883
*4 0.531635434 334.3648209 314174.4376 560.5126561
*5 0.616587465 923.6087963 3131865.076 1769.707624
*6 0.552438686 189.4146301 100236.3415 316.6012342
*7 0.532980815 243.6738117 164896.2278 406.0741653
#Observation: Cluster 0 had the best R2 score, followed by clusters 3 and 5. The rest had low scores
###### Train time series model with time as the only feature
from datetime import datetime as dt
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima_model import ARIMA
import math
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15, 6
import warnings
warnings.filterwarnings('ignore')
# Monthly sales series for store 1.
data = df[['Store', 'Date', 'Sales']]
data_store1 = data[data['Store']==1]
data_store1 = data_store1.drop(['Store'], axis=1)
# Map every date string (YYYY-MM-...) to the 15th of its month so that
# groupby('Date') aggregates to monthly totals.
data_store1['Date'] = data_store1['Date'].apply(lambda x: dt(int(x[:4]), int(x[5:7]), 15))
data_store1=data_store1.groupby('Date').sum().reset_index()
data_store1 = data_store1.set_index('Date')
ts = data_store1['Sales']
#Monthly trend across years
plt.plot(ts)
#Seasonality: Trend is upwards between months 10 and 12 and downwards between 1 and 3
import numpy as np
#transformation
# Log-transform to stabilise the variance before the stationarity checks.
ts_log = np.log(ts)
def test_stationarity(timeseries):
    """Plot the rolling mean/std of *timeseries* and print an ADF test summary.

    A visual check (rolling statistics) plus the augmented Dickey-Fuller test
    for stationarity of the series.
    """
    # Rolling statistics over a 3-period window.
    roll_mean = timeseries.rolling(window=3, center=False).mean()
    roll_std = timeseries.rolling(window=3, center=False).std()

    # Overlay the original series with its rolling mean and std.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(roll_mean, color='red', label='Rolling Mean')
    plt.plot(roll_std, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

    # Augmented Dickey-Fuller test with AIC-based lag selection.
    print('Results of Dickey-Fuller Test:')
    adf_result = adfuller(timeseries, autolag='AIC')
    summary = pd.Series(adf_result[0:4],
                        index=['Test Statistic', 'p-value', '#Lags Used',
                               'Number of Observations Used'])
    for key, value in adf_result[4].items():
        summary['Critical Value (%s)' % key] = value
    print(summary)
test_stationarity(data_store1['Sales'])
plt.plot(ts_log)
# Rolling statistics of the log series over a 6-month window.
movingAverage = ts_log.rolling(window=6).mean()
movingSTD = ts_log.rolling(window=6).std()
plt.plot(ts_log)
plt.plot(movingAverage, color='red')
# Get the difference between the moving average and the actual sales
ts_log_mv_diff = ts_log - movingAverage
ts_log_mv_diff.head(12)
#Remove Nan Values
# The first window-1 rows of the rolling mean are NaN by construction.
ts_log_mv_diff.dropna(inplace=True)
ts_log_mv_diff.head(10)
test_stationarity(ts_log_mv_diff)
# ACF/PACF of the detrended series with confidence bands.
# NOTE(review): the conventional 95% band is +/-1.96/sqrt(N); the 7.96 here
# looks like a typo of 1.96 — confirm before reading the bands.
plt.plot(np.arange(0,11), acf(ts_log_mv_diff, nlags = 10))
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-7.96/np.sqrt(len(ts_log_mv_diff)),linestyle='--',color='gray')
plt.axhline(y=7.96/np.sqrt(len(ts_log_mv_diff)),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
plt.show()
plt.plot(np.arange(0,11), pacf(ts_log_mv_diff, nlags = 10))
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-7.96/np.sqrt(len(ts_log_mv_diff)),linestyle='--',color='gray')
plt.axhline(y=7.96/np.sqrt(len(ts_log_mv_diff)),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.show()
# Fit ARIMA(p=1, d=1, q=0) on the log series.
# NOTE(review): statsmodels.tsa.arima_model.ARIMA (imported above) was removed
# in statsmodels 0.13; modern code uses statsmodels.tsa.arima.model.ARIMA, whose
# fit() takes no `disp` argument.
model = ARIMA(ts_log, order=(1, 1, 0))
results_ARIMA = model.fit(disp=-1)
plt.plot(ts_log_mv_diff)
plt.plot(results_ARIMA.fittedvalues, color='red')
# NOTE(review): fittedvalues are first differences of ts_log, while
# ts_log_mv_diff is a moving-average detrending — this RSS compares two
# different transforms of the series; verify that is intended.
plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues[1:] - ts_log_mv_diff)**2))
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff.head()
# Undo the differencing: cumulative sum of predicted diffs, anchored at the
# first log value.
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
predictions_ARIMA_diff_cumsum.head()
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0)
predictions_ARIMA_log.head()
# Back-transform from log scale and compare against the original series.
predictions_ARIMA = np.exp(predictions_ARIMA_log)
plt.plot(ts)
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ts)**2)/len(ts)))
Project Task: Week 4
Applying ANN:
import pandas as pd
# Reload and re-encode the training data from scratch for the ANN task.
df = pd.read_csv('train_data.csv')
from sklearn.preprocessing import LabelEncoder
import pandas as pd
#Label encode DayOfWeek and StateHoliday
encode = LabelEncoder()
# Unify StateHoliday's mixed int/str zero before one-hot encoding.
df['StateHoliday'] = df['StateHoliday'].replace([0], '0')
df['DayOfWeek'] = encode.fit_transform(df['DayOfWeek'])
Temp = pd.get_dummies(df['StateHoliday'])
Temp.head()
df = pd.concat([df, Temp], axis=1)
# One dict-based rename instead of four separate rename() calls.
df.rename(columns={'0': 'State0', 'a': 'Statea', 'b': 'Stateb', 'c': 'Statec'},
          inplace=True)
Temp = pd.get_dummies(df['DayOfWeek'])
Temp.head()
df = pd.concat([df, Temp], axis=1)
df.rename(columns={0: 'sun', 1: 'mon', 2: 'tue', 3: 'wed',
                   4: 'thu', 5: 'fri', 6: 'sat'}, inplace=True)
df = df.drop(['StateHoliday', 'DayOfWeek'], axis=1)
df
X = df.drop(['Sales', 'Date'], axis=1)
Y = df['Sales']
#find important features before running ANN
from sklearn.ensemble import RandomForestRegressor
# A quick forest fit; features with importance > 1% are printed and kept.
regrf = RandomForestRegressor()
rfmod = regrf.fit(X, Y)
for name, importance in zip(X.columns, rfmod.feature_importances_):
    if importance > 0.01:
        print('"' + name + '"' + ',' + str(importance))
# Keep only the features found important above.
# FIX: pass `axis` as a keyword — the positional axis argument to
# DataFrame.drop was deprecated in pandas 1.x and removed in 2.0.
X = df.drop(df.columns.difference(['Store', 'Customers', 'Promo']), axis=1, inplace=False)
from sklearn import model_selection
xtrain, xtest, ytrain, ytest = model_selection.train_test_split(X, Y,
                                                                test_size=0.3,
                                                                random_state=1)
xtrain.shape
import pandas
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# define base model
def baseline_model():
    """Build and compile the baseline ANN: 3 inputs -> 13 ReLU units -> 1 output."""
    net = Sequential()
    net.add(Dense(13, input_dim=3, kernel_initializer='normal', activation='relu'))
    net.add(Dense(1, kernel_initializer='normal'))
    # Compile with MSE loss and the Adam optimizer.
    net.compile(loss='mean_squared_error', optimizer='adam')
    return net

# evaluate model via 10-fold cross validation
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))
X.iloc[:50,]
# Function to generate Deep ANN model
def make_regression_ann(Optimizer_trial):
    """Build a 3-input, two-hidden-layer (5+5 ReLU) regression ANN compiled
    with the optimizer chosen by the hyperparameter search."""
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense

    net = Sequential()
    net.add(Dense(units=5, input_dim=3, kernel_initializer='normal', activation='relu'))
    net.add(Dense(units=5, kernel_initializer='normal', activation='relu'))
    net.add(Dense(1, kernel_initializer='normal'))
    net.compile(loss='mean_squared_error', optimizer=Optimizer_trial)
    return net
###########################################
from sklearn.model_selection import GridSearchCV
# NOTE(review): tensorflow.keras.wrappers.scikit_learn was removed in TF 2.12+;
# the maintained replacement is the scikeras package.
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
# Listing all the parameters to try
Parameter_Trials={'batch_size':[30,40],
'epochs':[10,20],
'Optimizer_trial':['adam', 'rmsprop']
}
# Creating the regression ANN model
RegModel=KerasRegressor(make_regression_ann, verbose=0)
###########################################
from sklearn.metrics import make_scorer
import numpy as np
# Defining a custom function to calculate accuracy
def Accuracy_Score(orig, pred):
    """Return 100 - MAPE of *pred* against *orig* (higher is better).

    FIX: entries where orig == 0 are excluded from the MAPE — Sales is 0 on
    closed days, and dividing by it previously drove the score to -inf/NaN.
    Behaviour is unchanged when no zeros are present.
    """
    orig, pred = np.array(orig, dtype=float), np.array(pred, dtype=float)
    mask = orig != 0
    MAPE = np.mean(100 * (np.abs(orig[mask] - pred[mask]) / orig[mask]))
    print('#'*70,'Accuracy:', 100-MAPE)
    return(100-MAPE)
custom_Scoring=make_scorer(Accuracy_Score, greater_is_better=True)
#########################################
# Creating the Grid search space
# See different scoring methods by using sklearn.metrics.SCORERS.keys()
grid_search=GridSearchCV(estimator=RegModel,
param_grid=Parameter_Trials,
scoring=custom_Scoring,
cv=5)
#########################################
# Measuring how much time it took to find the best params
import time
StartTime=time.time()
# Running Grid Search for different parameters
# Only the first 5000 rows are used, presumably to keep the search tractable.
grid_search.fit(X.iloc[0:5000,],Y.iloc[0:5000,], verbose=1)
EndTime=time.time()
print("########## Total Time Taken: ", round((EndTime-StartTime)/60), 'Minutes')
print('### Printing Best parameters ###')
grid_search.best_params_
# create ANN model
# Final 5+5 ReLU network; batch_size=30 / epochs=10 matches one of the searched
# combinations — presumably taken from grid_search.best_params_ (confirm).
model = Sequential()
# Defining the first layer of the model
model.add(Dense(units=5, input_dim=xtrain.shape[1], kernel_initializer='normal', activation='relu'))
# Defining the Second layer of the model
model.add(Dense(units=5, kernel_initializer='normal', activation='relu'))
# The output neuron is a single fully connected node
# Since we will be predicting a single number
model.add(Dense(1, kernel_initializer='normal'))
# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')
# Fitting the ANN to the Training set
model.fit(xtrain, ytrain ,batch_size = 30, epochs = 10, verbose=0)
y_pred = model.predict(xtest)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math
from math import sqrt
# Held-out metrics for the ANN.
r2score = r2_score(ytest, y_pred)
print('R2 Value/Coeff of Det: {}'.format(r2score))
print('Mean Absolute Error: {}'.format(mean_absolute_error(ytest, y_pred)))
mse = mean_squared_error(ytest, y_pred)
print("MSE: {}".format(mse))
rmse = math.sqrt(mse)
print("Root Mean Square Error: {}".format(rmse))